import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
import networkx as nx
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px
# Read the Mapbox token with a context manager so the file handle is closed,
# and strip surrounding whitespace (e.g. a trailing newline) from the token.
with open("./mapbox_token") as tokenFile:
    px.set_mapbox_access_token(tokenFile.read().strip())
# pip install plotly_express==0.4.0
from bs4 import BeautifulSoup
from warnings import filterwarnings
filterwarnings('ignore')
from datetime import datetime
# Data Source
# https://www.kaggle.com/kimjihoo/coronavirusdataset?select=SeoulFloating.csv
from sklearn.model_selection import train_test_split
# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
# Matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
# Diagram
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.externals.six import StringIO
from IPython.display import Image
Enable Table of Contents
pip install jupyter_contrib_nbextensions
jupyter contrib nbextension install --user
jupyter nbextension enable spellchecker/main
jupyter nbextension enable codefolding/main
# Load the case data and plot every case with usable coordinates on a map.
caseData = pd.read_csv('covid/case.csv')

# Coordinates may contain the placeholder '-'; coercing turns anything
# non-numeric into NaN.
cols = ['latitude', 'longitude']
caseData[cols] = caseData[cols].apply(pd.to_numeric, errors='coerce')
display(caseData.head())

# Drop rows without coordinates. Comparing against '-' after the numeric
# conversion would always be true and filter nothing.
caseDataForMap = caseData.dropna(subset=cols).copy()

# Build the marker text from the filtered frame so it stays aligned with the
# rows that are actually plotted.
markerText = (
    "<br>City: " + caseDataForMap["city"]
    + " <br>Province: " + caseDataForMap["province"]
)

fig = px.scatter_mapbox(
    caseDataForMap,
    text=markerText,
    lat="latitude",
    lon="longitude",
    color="confirmed",
    size="confirmed",
    color_continuous_scale=px.colors.sequential.Burg,
    size_max=100,
    mapbox_style='dark',
    zoom=6,
    title="Overview of the cases in Korea")
fig.show()
# Rank the cases by confirmed count, then total them per province and
# 'group' value.
caseData = pd.read_csv('covid/case.csv')
caseDataForMap = caseData.copy()
caseDataSorted = caseDataForMap.sort_values(by=['confirmed'], ascending=False)
display(caseDataSorted.head())

# Sum only the 'confirmed' column: summing the whole frame would also try to
# add up the text columns.
sortedValues = (
    caseDataForMap.groupby(['province', 'group'])['confirmed']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
sortedValues = sortedValues[['province', 'confirmed', 'group']]
display(sortedValues.head())
sortedValues['logConfirmed'] = np.log(sortedValues['confirmed'])

# catplot is the current name of the removed factorplot API.
chart = sns.catplot(x='confirmed', y='province', hue='group', data=sortedValues,
                    kind='bar', height=6.27, aspect=17.7/8.27)
chart.set_xticklabels(rotation=90)
# A log scale is used because Daegu's very high case count heavily skews
# the data.
caseData['logConfirmed'] = np.log(caseData['confirmed'])

# pivot() takes keyword arguments only on current pandas versions.
locations = caseData.pivot(index="province", columns="infection_case", values="logConfirmed")
display(locations.head())

f, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(locations, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Numbers of cases sort by source and province')
# Box plot of log(confirmed) per province.
caseData = pd.read_csv('covid/case.csv')
caseDataForMap = caseData.assign(logConfirmed=np.log(caseData['confirmed']))

a4_dims = (14.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
chart = sns.boxplot(
    data=caseDataForMap,
    x='logConfirmed',
    y='province',
    ax=ax,
)
# Total confirmed cases per province, plus summary statistics of the totals.
caseDataForMap = caseData.copy()
sortedValues = (
    caseDataForMap.groupby(['province'])['confirmed']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
sortedValues['logConfirmed'] = np.log(sortedValues['confirmed'])
sortedValues = sortedValues[['province', 'confirmed', 'logConfirmed']]
display(sortedValues.head())

# numeric_only=True skips the 'province' text column, which otherwise makes
# mean()/std() raise on current pandas versions.
print('Mean: ', sortedValues.mean(numeric_only=True).values)
print('Standard Deviation: ', sortedValues.std(numeric_only=True).values)
The number of COVID-19 cases per province has a mean of 670 and a standard deviation of 1610.
Hence we sort the provinces into 4 risk levels:
def calculate_risk_level(sortedValues):
    """Return the risk level (1-4) for one row of province totals.

    Args:
        sortedValues: a row (anything indexable by ``'confirmed'``) holding
            the confirmed case count.

    Returns:
        4 for >= 2000 confirmed cases, 3 for 1000-1999, 2 for 100-999,
        otherwise 1.
    """
    confirmed = sortedValues['confirmed']
    # Thresholds are checked from highest to lowest so the bands are
    # contiguous; previously 900-999 fell through to level 1.
    if confirmed >= 2000:
        return 4
    if confirmed >= 1000:
        return 3
    if confirmed >= 100:
        return 2
    return 1
# Score each province row, then keep only the province name and its level.
sortedValues = sortedValues.assign(
    **{'Risk Level': sortedValues.apply(calculate_risk_level, axis=1)}
)[['province', 'Risk Level']]
display(sortedValues.head())
# Count the patient records per confirmation date and plot the daily series.
patientData = pd.read_csv('covid/patientinfo.csv')
display(patientData.head())

groupPatientData = (
    patientData.groupby('confirmed_date')
    .size()
    .reset_index(name='count')
)
fig = px.line(
    groupPatientData,
    x="confirmed_date",
    y="count",
    title='Numbers Of Covid Cases Per Day',
)
fig.show()
# Treatment duration (confirmation to release) of released patients.
confinedDaysAnalysis = patientData.copy()
# The single '100s' record is dropped because it heavily skews its group.
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['age'] != '100s']
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['released_date'].notnull()].copy()

# Parse the dates column by column (one vectorised call per column instead
# of one call per row); unparseable values become NaT.
cols = ['released_date', 'confirmed_date']
confinedDaysAnalysis[cols] = confinedDaysAnalysis[cols].apply(pd.to_datetime, errors='coerce')
confinedDaysAnalysis['Total Treatment Days'] = (
    confinedDaysAnalysis['released_date'] - confinedDaysAnalysis['confirmed_date']
).dt.days

# Mean treatment days per sex and age band; groups without a mean are dropped.
groupedData = (
    confinedDaysAnalysis.groupby(['sex', 'age'])['Total Treatment Days']
    .mean()
    .dropna()
    .reset_index()
)
groupedData.columns = ["sex", "age", "Average Treatment Days"]
# pivot() takes keyword arguments only on current pandas versions.
dataForHeatmap = groupedData.pivot(index="sex", columns="age", values="Average Treatment Days")
display(dataForHeatmap.head())
f, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(dataForHeatmap, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Average Survivor Treatment Day, Sort by Age & Gender')

# Encode the age bands as ordinals for the trend line. Rows whose age is
# missing or not in the mapping become NaN and are removed.
age = {'0s':0, '10s':1, '20s':2, '30s':3, '40s':4, '50s':5, '60s':6, '70s':7, '80s':8, '90s':9}
confinedDaysAnalysis['age'] = confinedDaysAnalysis['age'].map(age)
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['age'].notna()]
fig = px.scatter(confinedDaysAnalysis, x="age", y="Total Treatment Days", color="sex", trendline="ols", title="Does age group affect total treatment days?")
fig.show()

# Same aggregation per province and (encoded) age band, rounded up to whole days.
groupedData = (
    confinedDaysAnalysis.groupby(['province', 'age'])['Total Treatment Days']
    .mean()
    .dropna()
    .reset_index()
)
groupedData.columns = ["province", "age", "Average Treatment Days"]
groupedData['Average Treatment Days'] = np.ceil(
    pd.to_numeric(groupedData['Average Treatment Days'], errors='coerce')
)
dataForHeatmap = groupedData.pivot(index="province", columns="age", values="Average Treatment Days")
display(dataForHeatmap.head())
f, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(dataForHeatmap, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Average Survivor Treatment Day, Sort by City & Age')
# Treatment duration (confirmation to death) of deceased patients.
confinedDaysAnalysis = patientData.copy()
# The single '100s' record is dropped because it heavily skews its group.
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['age'] != '100s']
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['deceased_date'].notnull()].copy()

# Parse the dates column by column; unparseable values become NaT.
cols = ['deceased_date', 'confirmed_date']
confinedDaysAnalysis[cols] = confinedDaysAnalysis[cols].apply(pd.to_datetime, errors='coerce')
confinedDaysAnalysis['Total Treatment Days'] = (
    confinedDaysAnalysis['deceased_date'] - confinedDaysAnalysis['confirmed_date']
).dt.days
display(confinedDaysAnalysis.head())

# Mean days per sex and age band; groups without a mean are dropped.
groupedData = (
    confinedDaysAnalysis.groupby(['sex', 'age'])['Total Treatment Days']
    .mean()
    .dropna()
    .reset_index()
)
groupedData.columns = ["sex", "age", "Average Treatment Days"]
# pivot() takes keyword arguments only on current pandas versions.
dataForHeatmap = groupedData.pivot(index="sex", columns="age", values="Average Treatment Days")
display(dataForHeatmap.head())
f, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(dataForHeatmap, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Average Deceased Treatment Day, Sort by Age & Gender')

# Same aggregation per province and age band, rounded up to whole days.
groupedData = (
    confinedDaysAnalysis.groupby(['province', 'age'])['Total Treatment Days']
    .mean()
    .dropna()
    .reset_index()
)
groupedData.columns = ["province", "age", "Average Treatment Days"]
groupedData['Average Treatment Days'] = np.ceil(
    pd.to_numeric(groupedData['Average Treatment Days'], errors='coerce')
)
dataForHeatmap = groupedData.pivot(index="province", columns="age", values="Average Treatment Days")
display(dataForHeatmap.head())
f, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(dataForHeatmap, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Average Deceased Treatment Day, Sort by City & Age')
# Outcome counts and percentages per province.
survivorCountAnalysis = patientData.copy()
survivorCountAnalysis['survive'] = survivorCountAnalysis['released_date'].notnull()
survivorCountAnalysis['deceased'] = survivorCountAnalysis['deceased_date'].notnull()
survivorCountAnalysis['under treatment'] = survivorCountAnalysis['deceased_date'].isnull() & survivorCountAnalysis['released_date'].isnull()

# Sum only the three outcome flags instead of every column of the frame.
outcomeCols = ['survive', 'deceased', 'under treatment']
provinceStats = survivorCountAnalysis.groupby(['province'])[outcomeCols].sum()
# Work on an explicit copy so the percentage columns are added to a real
# frame rather than to a slice of provinceStats.
provinceStatsClean = provinceStats.copy()
totalPatients = provinceStatsClean[outcomeCols].sum(axis=1)
for outcome in outcomeCols:
    provinceStatsClean[outcome + ' %'] = np.round(provinceStatsClean[outcome] / totalPatients * 100, 2)

provinceStatsAbsolute = provinceStatsClean[['survive', 'deceased', 'under treatment']]
provinceStatsPercentage = provinceStatsClean[['survive %', 'deceased %', 'under treatment %']]
display(provinceStatsAbsolute.head())
display(provinceStatsPercentage.head())

newTable = provinceStatsPercentage.reset_index()
display(newTable.head())
newTable = pd.melt(newTable, id_vars="province", var_name="stats", value_name="rate")
# catplot is the current name of the removed factorplot API.
chart = sns.catplot(x='rate', y='province', hue='stats', data=newTable, kind='bar', height=6.27, aspect=16.7/8.27)
chart.set_xticklabels(rotation=90)
# Outcome counts and percentages per sex.
survivorCountAnalysis = patientData.copy()
survivorCountAnalysis['survive'] = survivorCountAnalysis['released_date'].notnull()
survivorCountAnalysis['deceased'] = survivorCountAnalysis['deceased_date'].notnull()
survivorCountAnalysis['under treatment'] = survivorCountAnalysis['deceased_date'].isnull() & survivorCountAnalysis['released_date'].isnull()

# Sum only the three outcome flags instead of every column of the frame.
outcomeCols = ['survive', 'deceased', 'under treatment']
provinceStats = survivorCountAnalysis.groupby(['sex'])[outcomeCols].sum()
# Work on an explicit copy so the percentage columns are added to a real
# frame rather than to a slice of provinceStats.
provinceStatsClean = provinceStats.copy()
totalPatients = provinceStatsClean[outcomeCols].sum(axis=1)
for outcome in outcomeCols:
    provinceStatsClean[outcome + ' %'] = np.round(provinceStatsClean[outcome] / totalPatients * 100, 2)

provinceStatsAbsolute = provinceStatsClean[['survive', 'deceased', 'under treatment']]
provinceStatsPercentage = provinceStatsClean[['survive %', 'deceased %', 'under treatment %']]
display(provinceStatsAbsolute)
display(provinceStatsPercentage)

newTable = provinceStatsPercentage.reset_index()
newTable = pd.melt(newTable, id_vars="sex", var_name="stats", value_name="rate")
display(newTable.head())
# catplot is the current name of the removed factorplot API.
chart = sns.catplot(x='rate', y='sex', hue='stats', data=newTable, kind='bar', height=6.27, aspect=17.7/8.27)
chart.set_xticklabels(rotation=90)
# Build and draw the graph of patients linked by the 'infected_by' column.
networkData = patientData.copy()
networkData = networkData[networkData['infected_by'].notnull()]
networkData = networkData[['patient_id','sex','age','province','city','infection_case','infected_by','state']]
display(networkData.head())

A = list(networkData["infected_by"].unique())
B = list(networkData["patient_id"].unique())
node_list = set(A+B)

# Create Graph: one node per id seen in either column, one edge per row.
G = nx.Graph()
G.add_nodes_from(node_list)
G.add_edges_from(zip(networkData["infected_by"], networkData["patient_id"]))

pos = nx.spring_layout(G, k=0.5, iterations=50)
for n, p in pos.items():
    G.nodes[n]['pos'] = p

# Collect the coordinates in plain lists and hand them to plotly once.
# Extending the trace properties item by item rebuilds the whole tuple on
# every iteration, which is quadratic in the number of edges/nodes.
edge_x, edge_y = [], []
for source, target in G.edges():
    x0, y0 = G.nodes[source]['pos']
    x1, y1 = G.nodes[target]['pos']
    # None breaks the line so that separate edges are not joined up.
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5,color='#888'),
    hoverinfo='none',
    mode='lines')

# One marker per node, coloured and labelled by its number of neighbours.
node_x, node_y, node_color, node_text = [], [], [], []
for node, neighbours in G.adjacency():
    x, y = G.nodes[node]['pos']
    node_x.append(x)
    node_y.append(y)
    node_color.append(len(neighbours))
    node_text.append(str(node) +' # of connections: '+ str(len(neighbours)))

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    text=node_text,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='RdPu_r',
        reversescale=True,
        color=node_color,
        size=15,
        colorbar=dict(
            thickness=10,
            title='Numnber of Infected Cases',
            xanchor='left',
            titleside='right'
        ),
        line=dict(width=0)))

fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='<br>Korea Covid Network Connections',
                    titlefont=dict(size=16),
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20,l=5,r=5,t=40),
                    annotations=[ dict(
                        text="",
                        showarrow=False,
                        xref="paper", yref="paper") ],
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))
iplot(fig)
# Timeline of government policies; a missing end date is filled with today's
# date.
policyData = pd.read_csv('covid/policy.csv')
policyDataCopy = policyData.copy()
today = datetime.now().strftime('%Y-%m-%d')
policyDataCopy['end_date'] = policyDataCopy['end_date'].fillna(today)
display(policyDataCopy.head())

# One record per policy, in the Task/Start/Finish/Resource layout used below.
df = [
    dict(
        Task=row['gov_policy'],
        Start=row['start_date'],
        Finish=row['end_date'],
        Resource=row['type'],
    )
    for index, row in policyDataCopy.iterrows()
]
fig = px.timeline(
    df,
    x_start="Start",
    x_end="Finish",
    y="Task",
    color="Resource",
    title="Policy over time",
)
fig.show()
# Number of patient records per confirmation date.
patientData = pd.read_csv('covid/patientinfo.csv')
groupPatientData = (
    patientData.groupby('confirmed_date')
    .size()
    .reset_index(name='count')
)
fig = px.line(
    groupPatientData,
    x="confirmed_date",
    y="count",
    title='Numbers Of Covid Cases Per Day',
)
fig.show()

# All counters of time.csv over time, in long format.
timeData = pd.read_csv('covid/time.csv')
allCounters = ['test', 'negative', 'confirmed', 'released', 'deceased']
timeDataMelted = pd.melt(timeData, id_vars=['date'], value_vars=allCounters)
display(timeDataMelted.head())
fig = px.line(
    timeDataMelted,
    x="date",
    y="value",
    color='variable',
    title="Overall cases over time",
)
fig.show()

# Only the confirmed / released / deceased counters.
outcomeCounters = ['confirmed', 'released', 'deceased']
timeDataMelted = pd.melt(timeData, id_vars=['date'], value_vars=outcomeCounters)
display(timeDataMelted.head())
fig = px.line(
    timeDataMelted,
    x="date",
    y="value",
    color='variable',
    title="Confirmed, released & deceased over time",
)
fig.show()

# Confirmed cases per age group.
timeAgeData = pd.read_csv('covid/timeage.csv')
display(timeAgeData.head())
fig = px.line(
    timeAgeData,
    x="date",
    y="confirmed",
    color='age',
    title="Confirmed cases of various age group over time",
)
fig.show()

# Deceased cases per age group.
timeAgeData = pd.read_csv('covid/timeage.csv')
fig = px.line(
    timeAgeData,
    x="date",
    y="deceased",
    color='age',
    title="Deceased cases of various age group over time",
)
fig.show()

# Confirmed and deceased cases per sex.
timeGender = pd.read_csv('covid/timegender.csv')
display(timeGender.head())
fig = px.line(
    timeGender,
    x="date",
    y="confirmed",
    color='sex',
    title="Confirmed cases between genders over time",
)
fig.show()
fig = px.line(
    timeGender,
    x="date",
    y="deceased",
    color='sex',
    title="Deceased cases between genders over time",
)
fig.show()

# Confirmed, released and deceased cases per province.
timeProvince = pd.read_csv('covid/timeprovince.csv')
display(timeProvince.head())
fig = px.line(
    timeProvince,
    x="date",
    y="confirmed",
    color='province',
    title="Confirmed cases of various province over time",
)
fig.show()
fig = px.line(
    timeProvince,
    x="date",
    y="released",
    color='province',
    title="Released cases of various province over time",
)
fig.show()
fig = px.line(
    timeProvince,
    x="date",
    y="deceased",
    color='province',
    title="Deceased cases of various province over time",
)
fig.show()
# Attach hard-coded coordinates to each province and map the time series.
timeProvince = pd.read_csv('covid/timeprovince.csv')
# NOTE(review): 'Jeollabuk-do' carries exactly the same coordinates as
# 'Gwangju' below - confirm whether that is intended.
data = {
    'province': [
        'Seoul', 'Busan', 'Daegu', 'Incheon', 'Gwangju', 'Daejeon',
        'Ulsan', 'Sejong', 'Gyeonggi-do', 'Gangwon-do',
        'Chungcheongbuk-do', 'Chungcheongnam-do', 'Jeollabuk-do',
        'Jeollanam-do', 'Gyeongsangbuk-do', 'Gyeongsangnam-do', 'Jeju-do',
    ],
    'longitude': [
        127.047325, 129.066666, 128.600006, 126.705208, 126.916664, 127.385002,
        129.316666, 127.2822, 127.143738, 127.920158,
        127.935905, 126.979874, 126.916664, 126.9910,
        129.263885, 128.429581, 126.5312,
    ],
    'latitude': [
        37.517235, 35.166668, 35.866669, 37.456257, 35.166668, 36.351002,
        35.549999, 36.4870, 37.603405, 37.342220,
        36.981304, 36.806702, 35.166668, 34.8679,
        35.835354, 34.855228, 33.4996,
    ],
}
location = pd.DataFrame(data=data)

# Left-join the coordinates, then drop every row that still has a missing value.
mergedData = timeProvince.merge(location, on='province', how='left').dropna()
mergedData.head()

# Options shared by the animated and the static map.
mapOptions = dict(
    lat="latitude", lon="longitude",
    size="confirmed", size_max=100,
    color="deceased", color_continuous_scale=px.colors.sequential.Burg,
    hover_name="province",
    mapbox_style='dark', zoom=6,
)

# Animated map: one frame per date.
fig = px.scatter_mapbox(
    mergedData,
    animation_frame="date", animation_group="province",
    **mapOptions
)
# Set the frame and transition durations of the first animation button,
# hide the colour scale and adjust the top padding of the controls.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 200
fig.layout.updatemenus[0].buttons[0].args[1]["transition"]["duration"] = 200
fig.layout.coloraxis.showscale = False
fig.layout.sliders[0].pad.t = 10
fig.layout.updatemenus[0].pad.t= 10
fig.show()

# Static map of the same data without animation.
fig = px.scatter_mapbox(mergedData, **mapOptions)
fig.show()
# Average temperature per province for 2020-01-01 .. 2020-08-31.
weatherData = pd.read_csv('covid/weather.csv')
weatherData = weatherData.set_index('date')
# Label-based slice on the date strings of the index.
weatherData = weatherData.loc['2020-01-01':'2020-08-31'].reset_index()
display(weatherData.head())

# pivot() takes keyword arguments only on current pandas versions.
weatherDataForHeatmap = weatherData.pivot(index="date", columns="province", values="avg_temp")
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(weatherDataForHeatmap, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Weather Over Time')
# Nursing-home count (marker size) and elderly population ratio (colour)
# per city.
regionData = pd.read_csv('covid/region.csv')
# Raise the display limit so the whole table can be shown.
rowCount = regionData.shape[0]
pd.set_option('display.max_rows', rowCount + 1)
display(regionData.head())

# Rows whose city is 'Korea' are left out of the map.
citiesOnly = regionData[regionData.city != 'Korea']
fig = px.scatter_mapbox(
    citiesOnly,
    lat="latitude",
    lon="longitude",
    text="city",
    size="nursing_home_count",
    size_max=100,
    color="elderly_population_ratio",
    color_continuous_scale=px.colors.sequential.Burg,
    zoom=6,
    title="Number of nusing home and elder population ratio across Korea",
)
fig.show()
import requests

# Fetch the page. The timeout keeps the notebook from hanging on a stalled
# connection, and raise_for_status() turns an HTTP error into an exception
# instead of parsing an error page.
page = requests.get("https://www.worldometers.info/coronavirus", timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.content, 'lxml')
covidTable = soup.find('table', attrs={'id': 'main_table_countries_today'})
if covidTable is None:
    raise ValueError("table 'main_table_countries_today' not found in the page")

# Keep the first 12 cells of every row; row 0 is the header.
rows = covidTable.find_all("tr", attrs={"style": ""})
covidData = [tableRow.text.strip().split("\n")[:12] for tableRow in rows]

covidTable = pd.DataFrame(covidData[1:], columns=covidData[0])
# Drop rows whose '#' cell is 'World' or 'Total:', then the '#' column itself.
covidTable = covidTable[~covidTable['#'].isin(['World', 'Total:'])]
covidTable = covidTable.drop('#', axis =1)
covidTable.head()
fig = px.bar(covidTable, y='TotalCases', x='Country,Other')
fig.show()
We use a log scale because the data is highly skewed.
# Strip every non-digit character before converting. regex=True is required:
# on current pandas versions str.replace treats the pattern as literal text
# by default.
totalCases = covidTable['TotalCases'].str.replace(r'\D', '', regex=True).astype(int)
covidTable['logTotalCases'] = np.log(totalCases)
covidTable = covidTable.sort_values(by=['logTotalCases'])
fig = px.bar(covidTable, y='logTotalCases', x='Country,Other')
fig.show()

pd.set_option('display.max_rows', covidTable.shape[0]+1)

# Blank death counts become NaN during the numeric conversion and are stored
# as 0. The log is taken only of positive counts so that zero or missing
# values stay NaN instead of turning into -inf.
totalDeaths = pd.to_numeric(
    covidTable['TotalDeaths'].str.replace(r'\D', '', regex=True),
    errors='coerce',
)
covidTable['TotalDeaths'] = totalDeaths.fillna(0)
covidTable['logTotalDeath'] = np.log(totalDeaths.where(totalDeaths > 0))
covidTable = covidTable.sort_values(by=['logTotalDeath'])
fig = px.bar(covidTable, y='logTotalDeath', x='Country,Other')
fig.show()
We will create a model which will predict whether a COVID-19 patient will survive given a certain set of conditions
patientData = pd.read_csv('covid/patientinfo.csv')
patientModellingData = patientData[['sex','age','province','confirmed_date','released_date','deceased_date','state']]
## We removed isolated patient since it is not confirmed if they survived or not
patientModellingData = patientModellingData[patientModellingData['state'] != 'isolated']

# Drop rows with a missing sex, age or confirmation date in one call and
# take an explicit copy so the assignment below targets a real frame.
nullList = ['sex','age','confirmed_date']
patientModellingData = patientModellingData.dropna(subset=nullList).copy()
display(patientModellingData.head())

# Parse the dates column by column (one vectorised call per column instead
# of one call per row); unparseable values become NaT.
cols = ['released_date', 'deceased_date', 'confirmed_date']
patientModellingData[cols] = patientModellingData[cols].apply(pd.to_datetime, errors='coerce')
def calculate_number_of_treatment_days(row):
    """Return the days between confirmation and release (or death).

    Args:
        row: a record indexable by ``'confirmed_date'``, ``'released_date'``
            and ``'deceased_date'``, holding timestamps or missing values.

    Returns:
        Whole days from the confirmation date to the release date if there
        is one, otherwise to the deceased date; ``None`` when the needed
        dates are missing.
    """
    confirmed = row['confirmed_date']
    # The release date takes precedence over the deceased date.
    for endColumn in ('released_date', 'deceased_date'):
        endDate = row[endColumn]
        # pd.notna also recognises None and NaN; an identity check against
        # pd.NaT would treat those as valid dates and fail on subtraction.
        if pd.notna(endDate):
            if pd.isna(confirmed):
                return None
            return (endDate - confirmed).days
    return None
# Compute the treatment days per patient and keep only the rows that have one.
patientModellingData['Treatment Days'] = patientModellingData.apply(
    calculate_number_of_treatment_days, axis=1
)
hasTreatmentDays = patientModellingData['Treatment Days'].notnull()
patientModellingData = patientModellingData[hasTreatmentDays]
modelColumns = ['sex', 'age', 'province', 'state', 'Treatment Days']
patientModellingDataTreatment = patientModellingData[modelColumns]
display(patientModellingDataTreatment.head())

# Replace sex, state and age band by numeric codes.
genders = {"male": 0, "female": 1}
patientModellingDataTreatment['sex'] = patientModellingDataTreatment['sex'].map(genders)
display(patientModellingDataTreatment.head())

state = {"released": 0, "deceased": 1}
patientModellingDataTreatment['state'] = patientModellingDataTreatment['state'].map(state)
display(patientModellingDataTreatment.head())

age = {'0s':0, '10s':1, '20s':2, '30s':3, '40s':4, '50s':5, '60s':6, '70s':7, '80s':8, '90s':9}
patientModellingDataTreatment['age'] = patientModellingDataTreatment['age'].map(age)
display(patientModellingDataTreatment.head())

# One indicator column per province replaces the text column.
provinceDummy = pd.get_dummies(patientModellingDataTreatment['province'])
provinceDummy
dataForModelling = patientModellingDataTreatment.join(provinceDummy).drop(columns='province')
dataForModelling

# Correlation heatmap of all model columns.
plt.figure(figsize=(10,5))
sns.heatmap(dataForModelling.corr())

dataForModelling['age'] = pd.to_numeric(dataForModelling['age'])
dataForModellingForCleaning = dataForModelling.copy()

# Checks before cleaning: per-column counts, any NaN present, all values finite.
display(dataForModellingForCleaning.count())
print(np.any(np.isnan(dataForModellingForCleaning)))
print(np.all(np.isfinite(dataForModellingForCleaning)))
def clean_dataset(df):
    """Drop rows with NaN or infinite values and cast the rest to float64.

    Rows containing NaN are removed from ``df`` itself (in place). Rows
    containing +/-inf are filtered out only in the returned frame.

    Args:
        df: the DataFrame to clean.

    Returns:
        A new DataFrame without NaN/inf rows, with every column cast to
        ``np.float64``.

    Raises:
        TypeError: if ``df`` is not a ``pd.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df needs to be a pd.DataFrame")
    # Kept in place: the argument is modified so that NaN rows disappear
    # from the caller's frame as well.
    df.dropna(inplace=True)
    # NaN rows are already gone, so only the infinities remain to be checked.
    # `axis` must be passed by keyword on current pandas versions.
    hasInfinite = df.isin([np.inf, -np.inf]).any(axis=1)
    return df[~hasInfinite].astype(np.float64)
# Keep the cleaned frame: clean_dataset() removes the +/-inf rows and casts
# to float64 only in its return value.
dataForModellingForCleaning = clean_dataset(dataForModellingForCleaning)
display(dataForModellingForCleaning.count())
print(np.any(np.isnan(dataForModellingForCleaning)))
print(np.all(np.isfinite(dataForModellingForCleaning)))

x = dataForModellingForCleaning.drop("state", axis=1)
y = dataForModellingForCleaning["state"]
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.33, random_state=42)

# Every model is fitted on the training split and scored on the held-out
# test split. Scoring on the training data measures memorisation rather
# than predictive accuracy.
sgd = linear_model.SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_test, Y_test) * 100, 2)

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_test, Y_test) * 100, 2)

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_test, Y_test) * 100, 2)

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_test, Y_test) * 100, 2)

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_test, Y_test) * 100, 2)

perceptron = Perceptron(max_iter=5)
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_test, Y_test) * 100, 2)

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_test, Y_test) * 100, 2)

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_test, Y_test) * 100, 2)

# Rank the models by test accuracy (in percent).
results = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
              'Random Forest', 'Naive Bayes', 'Perceptron',
              'Stochastic Gradient Decent',
              'Decision Tree'],
    'Score': [acc_linear_svc, acc_knn, acc_log,
              acc_random_forest, acc_gaussian, acc_perceptron,
              acc_sgd, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)
# Render the fitted decision tree to a PNG file and display it.
feature_cols = x.columns
dot_data = StringIO()
export_graphviz(decision_tree, out_file = dot_data,
                feature_names = feature_cols,
                filled = True, rounded = True,
                special_characters = True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('covidTree.png')
Image(graph.create_png())
# Entropy 0 == no disorder // perfect knowledge, perfect classification

# Random-forest feature importances, most important first.
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(random_forest.feature_importances_,3)})
importances1 = importances.sort_values('importance',ascending=False).set_index('feature')
importances1.head(15)
# Plot the sorted table; reset_index() turns 'feature' back into a column.
barData = importances1.reset_index()
fig = px.bar(barData, x='feature', y='importance')
fig.show()

# Training split: out-of-fold predictions from 3-fold cross-validation.
predictions = cross_val_predict(random_forest, X_train, Y_train, cv=3)
result1_train = confusion_matrix(Y_train, predictions)
display(result1_train)
p1_train = precision_score(Y_train, predictions)
r1_train = recall_score(Y_train, predictions)
print("Precision:", p1_train)
print("Recall:", r1_train)

# Test split: use the forest that was fitted on the training data.
# cross_val_predict on X_test would fit fresh models on parts of the test
# set instead of evaluating the trained model.
predictions = random_forest.predict(X_test)
result1_test = confusion_matrix(Y_test, predictions)
display(result1_test)
p1_test = precision_score(Y_test, predictions)
r1_test = recall_score(Y_test, predictions)
print("Precision:", p1_test)
print("Recall:", r1_test)
Let's observe what happens if we drop the treatment days column. Will the model be more accurate?
# Repeat the experiment without the 'Treatment Days' feature.
x2 = dataForModellingForCleaning.drop(["state","Treatment Days"], axis=1)
y2 = dataForModellingForCleaning["state"]
X_train, X_test, Y_train, Y_test = train_test_split(x2, y2, test_size=0.33, random_state=42)

# Fit on the training split and score on the held-out test split.
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_test, Y_test) * 100, 2)

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_test, Y_test) * 100, 2)

results = pd.DataFrame({
    'Model': ['Random Forest','Decision Tree'],
    'Score': [acc_random_forest, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(2)

# Render the fitted decision tree to a PNG file and display it.
feature_cols = x2.columns
dot_data = StringIO()
export_graphviz(decision_tree, out_file = dot_data,
                feature_names = feature_cols,
                filled = True, rounded = True,
                special_characters = True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('covidTree2.png')
Image(graph.create_png())

# Random-forest feature importances, most important first.
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(random_forest.feature_importances_,3)})
importances2 = importances.sort_values('importance',ascending=False).set_index('feature')
importances2.head(15)
# Plot the sorted table; reset_index() turns 'feature' back into a column.
barData = importances2.reset_index()
fig = px.bar(barData, x='feature', y='importance')
fig.show()

# Training split: out-of-fold predictions from 3-fold cross-validation.
predictions = cross_val_predict(random_forest, X_train, Y_train, cv=3)
result2_train = confusion_matrix(Y_train, predictions)
display(result2_train)
p2_train = precision_score(Y_train, predictions)
r2_train = recall_score(Y_train, predictions)
print("Precision:", p2_train)
print("Recall:", r2_train)

# Test split: use the forest that was fitted on the training data rather
# than fitting fresh models on parts of the test set.
predictions = random_forest.predict(X_test)
result2_test = confusion_matrix(Y_test, predictions)
display(result2_test)
p2_test = precision_score(Y_test, predictions)
r2_test = recall_score(Y_test, predictions)
print("Precision:", p2_test)
print("Recall:", r2_test)
The old model is better than the new model even though it has a lower precision.
The number of COVID-19 survivors is much higher than the number of deceased patients.
Hence, it is much harder to predict who will pass away from the infection.
The new model is unable to predict whether a patient will pass away.
The number of false negatives outweighs the number of true negatives.
On the other hand, the old model predicts the deceased much more accurately than the new model.
With that in mind, this also indicates that the number of days of treatment is vital.
Generally, those who passed away from the infection had fewer days of treatment. However, is it right for us to conclude that the fewer the treatment days, the higher the fatality rate? Is the risk of dying higher during the first few days/weeks of treatment? Or is there something more to it?
This situation is something similar to Abraham Wald and the Missing Bullet Holes. @penguinpress summed up the situation perfectly:
You don't want your planes to get shot down by enemy fighters, so you armor them. But armor makes the plane heavier, and heavier planes are less maneuverable and use more fuel. Armoring the planes too much is a problem; armoring the planes too little is a problem. Somewhere in between there's an optimum.
When American planes came back from engagements over Europe, they were covered in bullet holes. But the damage wasn't uniformly distributed across the aircraft. There were more bullet holes in the fuselage, not so many in the engines.

At first glance, it seems reasonable to focus the armor on the fuselage. However, the armor, said Wald, doesn't go where the bullet holes are. It goes where the bullet holes aren't: on the engines.
The missing bullet holes were on the missing planes. The reason planes were coming back with fewer hits to the engine is that planes that got hit in the engine weren't coming back.
If you go to the recovery room at the hospital, you'll see a lot more people with bullet holes in their legs than people with bullet holes in their chests. But that's not because people don't get shot in the chest; it's because the people who get shot in the chest don't recover.
The link to the excellent article written by @penguinpress is in the credit section below.
In this case, our model only includes people who passed away after they had gone through treatment. We have excluded people who died before they had the opportunity to go through treatment!
Even after excluding those who did not undergo treatment before they passed away, our model indicates that the number of treatment days is the strongest factor among gender, location and age. Treatment days has a whopping importance of 60%, followed by age, which has an importance of 20%.
The chances of survival increase as the number of treatment days increases. Hence, the faster the detection, the earlier the treatment, and the more days and opportunities there are to treat the infection, the better the chances of survival. The Korean government put a very strong emphasis on its COVID-19 detection programme, and hence managed to contain the COVID-19 situation.
Various cities around the world were also locked down to prevent the rapid spread of COVID-19. The healthcare system would be overwhelmed if COVID-19 were to spread widely, which would cause the death rate to escalate as more people would have less opportunity to undergo treatment.
The older the person, the higher the chances of infection and death. Hence it is not advisable for elderly people to go out unless it is necessary.
In terms of location, the majority of the cases came from Daegu, and the sources of infection are places such as churches and clubs. The higher the amount of contact, the greater the chances of infection. It is advisable for the authorities to close such places until a vaccine for COVID-19 is found. From May to June, there was an increase in COVID-19 cases due to complacency.
https://www.aa.com.tr/en/asia-pacific/s-korea-sees-mass-covid-19-cases-linked-to-night-clubs/1838031
https://www.channelnewsasia.com/news/asia/south-korea-covid-19-church-backlash-13092284
Will the model's accuracy improve if we include patients who died without any treatment?
# Per-column counts for all deceased patients, then for the deceased
# patients that have no deceased_date.
patientData = pd.read_csv('covid/patientinfo.csv')
modellingColumns = ['sex','age','province','confirmed_date','released_date','deceased_date','state']
patientModellingData = patientData[modellingColumns]

isDeceased = patientData['state'] == 'deceased'
deadPatientModellingData = patientData[isDeceased]
totalDiedPatient = deadPatientModellingData.count()
display(totalDiedPatient)
display('***********')

# Raise the display limit so the whole table can be shown.
deadPatientCount = deadPatientModellingData.shape[0]
pd.set_option('display.max_rows', deadPatientCount + 1)

missingDeceasedDate = deadPatientModellingData['deceased_date'].isnull()
totalDiedPaitentWithoutTreatment = deadPatientModellingData[missingDeceasedDate].count()
display(totalDiedPaitentWithoutTreatment)
We assume that patients who died without any released_date or deceased_date are those who died without any treatment.
There are about 12 patients who died without any treatment.
# Percentage string for 12 out of 78, using hard-coded numbers.
# NOTE(review): 12 and 78 presumably come from the counts displayed just
# before this line - confirm they still match the data.
str(12/78 * 100) + '%'
This accounts for 15% of the total deaths.
We will simply add another line of logic to return treatment days = 0
if the patient is deceased and both the released and deceased dates are empty.
patientData = pd.read_csv('covid/patientinfo.csv')
patientModellingData = patientData[['sex','age','province','confirmed_date','released_date','deceased_date','state']]
## We removed isolated patient since it is not confirmed if they survived or not
patientModellingData = patientModellingData[patientModellingData['state'] != 'isolated']

# Drop rows with a missing sex, age or confirmation date in one call and
# take an explicit copy so the assignment below targets a real frame.
nullList = ['sex','age','confirmed_date']
patientModellingData = patientModellingData.dropna(subset=nullList).copy()
display(patientModellingData.head())

# Parse the dates column by column (one vectorised call per column instead
# of one call per row); unparseable values become NaT.
cols = ['released_date', 'deceased_date', 'confirmed_date']
patientModellingData[cols] = patientModellingData[cols].apply(pd.to_datetime, errors='coerce')
def calculate_number_of_treatment_days(row):
    """Return the number of days a patient spent under treatment.

    The duration is measured from ``confirmed_date`` to ``released_date``
    when a release date is present, otherwise to ``deceased_date``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``released_date``, ``deceased_date``, ``confirmed_date`` and
            ``state``.

    Returns:
        The day difference when an end date is available; ``0`` when both end
        dates are missing and ``state`` is ``'deceased'``; ``None`` otherwise.
    """
    # pd.notna treats NaT, NaN and None alike as "missing", whereas an
    # identity check against pd.NaT only recognises NaT itself.
    for end_column in ("released_date", "deceased_date"):
        end_date = row[end_column]
        if pd.notna(end_date):
            return (end_date - row["confirmed_date"]).days

    # No end date at all: a deceased patient counts as zero treatment days.
    if row["state"] == 'deceased':
        return 0
    return None
# Compute the treatment duration per patient and keep only rows where it is defined.
patientModellingData['Treatment Days'] = patientModellingData.apply(calculate_number_of_treatment_days, axis=1)
patientModellingData = patientModellingData[patientModellingData['Treatment Days'].notnull()]
patientModellingDataTreatment = patientModellingData[['sex', 'age', 'province', 'state', 'Treatment Days']]
display(patientModellingDataTreatment.head())

# Integer encodings for the categorical columns; a label that is absent from
# its mapping becomes NaN after .map().
genders = {"male": 0, "female": 1}
state = {"released": 0, "deceased": 1}
age = {'0s':0, '10s':1, '20s':2, '30s':3, '40s':4, '50s':5, '60s':6, '70s':7, '80s':8, '90s':9}
patientModellingDataTreatment['sex'] = patientModellingDataTreatment['sex'].map(genders)
patientModellingDataTreatment['state'] = patientModellingDataTreatment['state'].map(state)
patientModellingDataTreatment['age'] = patientModellingDataTreatment['age'].map(age)
display(patientModellingDataTreatment.head())

# Attach the province columns from provinceDummy (built elsewhere in the
# notebook) and drop the raw province label.
dataForModelling = patientModellingDataTreatment.join(provinceDummy)
dataForModelling = dataForModelling.drop(columns='province')
dataForModelling

# clean_dataset is defined elsewhere; its return value is discarded here.
# NOTE(review): confirm it cleans the frame in place.
clean_dataset(dataForModelling)
display(dataForModelling.count())

# Sanity checks: is any value NaN, and is every value finite?
print(np.any(np.isnan(dataForModelling)))
print(np.all(np.isfinite(dataForModelling)))

# Features are every column except the target 'state'; hold out 33% for testing.
x3 = dataForModelling.drop(columns=["state"])
y3 = dataForModelling["state"]
X_train, X_test, Y_train, Y_test = train_test_split(x3, y3, test_size=0.33, random_state=42)
## Random_forest
# Fit a 100-tree random forest and predict on the held-out split.
random_forest = RandomForestClassifier(n_estimators=100).fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
# NOTE(review): both accuracy figures below are measured on the training
# split, so they describe fit rather than generalisation.
acc_random_forest = round(100 * random_forest.score(X_train, Y_train), 2)

## Decision Tree
decision_tree = DecisionTreeClassifier().fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(100 * decision_tree.score(X_train, Y_train), 2)

# Tabulate the two scores, best first, indexed by score.
results = pd.DataFrame(
    {
        'Model': ['Random Forest', 'Decision Tree'],
        'Score': [acc_random_forest, acc_decision_tree],
    }
)
result_df = results.sort_values(by='Score', ascending=False).set_index('Score')
result_df.head(2)
# Pair every training feature with the random forest's importance (3 decimals).
importances = pd.DataFrame(
    {
        'feature': X_train.columns,
        'importance': np.round(random_forest.feature_importances_, 3),
    }
)
# Rank from most to least important and index by feature name.
importances3 = importances.sort_values(by='importance', ascending=False)
importances3 = importances3.set_index('feature')
importances3.head(15)
# 3-fold cross-validated predictions on the training split.
predictions = cross_val_predict(estimator=random_forest, X=X_train, y=Y_train, cv=3)
result3_train = confusion_matrix(y_true=Y_train, y_pred=predictions)
display(result3_train)
p3_train = precision_score(y_true=Y_train, y_pred=predictions)
r3_train = recall_score(y_true=Y_train, y_pred=predictions)
print("Precision:", p3_train)
print("Recall:", r3_train)

# Same procedure on the test split.
# NOTE(review): cross_val_predict fits fresh clones on folds of the data it is
# given, so these figures do not evaluate the forest fitted on the training split.
predictions = cross_val_predict(estimator=random_forest, X=X_test, y=Y_test, cv=3)
result3_test = confusion_matrix(y_true=Y_test, y_pred=predictions)
display(result3_test)
p3_test = precision_score(y_true=Y_test, y_pred=predictions)
r3_test = recall_score(y_true=Y_test, y_pred=predictions)
print("Precision:", p3_test)
print("Recall:", r3_test)
# Recap of the three experiments: first the top features and confusion
# matrices, then precision/recall on the train and test splits.
_separator = '******************************'
_model_reports = [
    ('Model with treatment day :',
     importances1, result1_train, result1_test,
     p1_train, r1_train, p1_test, r1_test),
    ('Model without treatment day: ',
     importances2, result2_train, result2_test,
     p2_train, r2_train, p2_test, r2_test),
    ('Model with treatment day and those who died without treatment:',
     importances3, result3_train, result3_test,
     p3_train, r3_train, p3_test, r3_test),
]

# Part 1: every model section is followed by a separator line.
for (_title, _importance, _train_cm, _test_cm,
     _p_train, _r_train, _p_test, _r_test) in _model_reports:
    print(_title)
    print('importance:', _importance.head(3))
    display(_train_cm)
    display(_test_cm)
    print(_separator)

# Part 2: separators go only between sections, none after the last one.
for _position, (_title, _importance, _train_cm, _test_cm,
                _p_train, _r_train, _p_test, _r_test) in enumerate(_model_reports):
    if _position > 0:
        print(_separator)
    print(_title)
    print("Train Data Precision:", _p_train)
    print("Train Data Recall:", _r_train)
    print("Test Data Precision:", _p_test)
    print("Test Data Recall:", _r_test)
Abraham Wald and the Missing Bullet Holes:
https://medium.com/@penguinpress/an-excerpt-from-how-not-to-be-wrong-by-jordan-ellenberg-664e708cfc3d
Decision Tree Diagram:
Ze Yu's sample project